iPSC splicing

Read in the splicing results returned by MAJIQ and make a volcano plot, only highlight genes of interest with a label.

junc_colors = c("#648FFF","#fe6101")
ipsc_splicing = fread("/Users/annaleigh/Documents/GitHub/unc13a_cryptic_splicing/data/ipsc_splicing_results.csv")


ipsc_splicing %>% 
  mutate(junction_name = glue::glue("{gene_name} - {paste_into_igv_junction}")) %>% 
  mutate(`Novel Junction` = de_novo_junctions == 0) %>% 
  mutate(log10_test_stat = -log10(1 - p_d_psi_0_10_per_lsv_junction)) %>% 
  mutate(log10_test_stat = ifelse(is.infinite(log10_test_stat), 16, log10_test_stat)) %>% 
  mutate(graph_alpha = ifelse(p_d_psi_0_10_per_lsv_junction > 0.9, 1, 0.2)) %>% 
  mutate(label_junction = case_when(gene_name %in% c("UNC13A",
                                                     "UNC13B","PFKP","SETD5",
                                                     "ATG4B","STMN2") & 
                                      p_d_psi_0_10_per_lsv_junction > 0.9  & 
                                      deltaPSI > 0 ~ junction_name,
                                    T ~ "")) %>% 
  ggplot(aes(x = deltaPSI, y = log10_test_stat)) +
  geom_point(aes(fill = `Novel Junction`,alpha = graph_alpha), pch = 21, size = 4) + 
  geom_text_repel(aes(label = label_junction), point.padding = 0.3,
                  min.segment.length = unit(0, 'lines')) +
  geom_hline(yintercept = -log10(1 - .9)) + 
  guides(alpha = FALSE, size = FALSE) + 
  scale_fill_manual(values = junc_colors) + 
  theme(legend.position = 'top') + 
  ggpubr::theme_pubr() + 
  xlab("delta PSI") + 
  ylab("-Log 10 Test Statistic") + 
  theme(text = element_text(size = 18))

UNC13A Cryptic PSI across TDP-43 KD experiments

We have 2 PSI’s for the UNC13A cryptic, one which includes both the short and long form of the cryptic, and one which does not. While we’re not mentioning in figure 1, given that the longer form of the cryptic appears in control cerebellum, one could argue that that junction should not be included in the PSI calculation here for UNC13A cryptic per se. In practice, this makes very little difference in the conclusions made on the cell lines, so I’ve included both for completeness.

The ‘C/G’ tells which genotypes were supported by RNA-seq on rs12973192. The NB cell lines are het, as is the WTC11 cell line. SH-SY5Y cells are homozygote for the major allele. There was variability on the Klim hMN set on allelic expression.

rel_rna_cryptic_amount = fread("/Users/annaleigh/Documents/GitHub/unc13a_cryptic_splicing/data/kd_experiments_relative_rna_and_unc13a_cryptic_junction_counts.csv")
rel_rna_cryptic_amount[,cryptic_psi_full := ( UNC13A_3prime + 
                                          UNC13A_5prime +  UNC13A_5prime_2 + 
                                          UNC13A_5prime_3) / (UNC13A_annotated +  UNC13A_3prime + 
                                          UNC13A_5prime +  UNC13A_5prime_2 + 
                                          UNC13A_5prime_3)]
rel_rna_cryptic_amount %>% 
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "cryptic_psi_full",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) + 
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) + 
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    scale_x_discrete(labels=c("shsy5y_normed" = "NB cells \n C/G", 
                              "ipsc_normed_ward" = "iPSC Neurons \n C/G",
                              "shsy5y_normed_no_snp" = "SH-SY5Y \n C/C",
                              "klim_normed" = "Klim iPSC MN \n C/G + C/C",
                              "sh_dzap" = "Appocher \n NB cells \n C/G")) +
    ylab("UNC13A Cryptic PSI") + 
    xlab("") + 
    guides(color = FALSE) 

 rel_rna_cryptic_amount %>% 
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "cryptic_psi",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) + 
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) + 
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    scale_x_discrete(labels=c("shsy5y_normed" = "NB cells \n C/G", 
                              "ipsc_normed_ward" = "iPSC Neurons \n C/G",
                              "shsy5y_normed_no_snp" = "SH-SY5Y \n C/C",
                              "klim_normed" = "Klim iPSC MN \n C/G + C/C",
                              "sh_dzap" = "Appocher \n NB cells \n C/G")) +
    ylab("UNC13A Cryptic PSI") + 
    xlab("") + 
    guides(color = FALSE) 

Paient summary statistics

First let’s look at only including tissues with detectable stmn2 or UNC13A CE

clean_data_table = fread(file.path(here(),"data","nygc_junction_information.csv"))
clean_data_table = clean_data_table %>%     
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>% 
    mutate(number_g_alleles = as.numeric(call) - 1) %>% 
    mutate(unc13a_cryptic_leaf_psi = ifelse(is.na(unc13a_cryptic_leaf_psi),0,unc13a_cryptic_leaf_psi)) %>% 
    mutate(stmn2_psi_groups = as.numeric(cut_interval(log10(stmn_2_cryptic_psi_leaf), n = 2))) %>%
    mutate(stmn2_psi_groups = case_when(stmn2_psi_groups == 1 ~ " Low STMN2",
                                      stmn2_psi_groups == 2 ~ "High STMN2",
                                      TRUE ~ "No STMN2"))

print(glue::glue("Number of unique patients: {clean_data_table[,length(unique(participant_id))]}"))
## Number of unique patients: 377
print(glue::glue("Number of unique tissue samples: {clean_data_table[,length(unique(sample))]}"))
## Number of unique tissue samples: 1349
print("Patients Per Disease Category")
## [1] "Patients Per Disease Category"
clean_data_table[,length(unique(participant_id)),by = disease]
##    disease  V1
## 1: ALS-FTD  23
## 2:     ALS 193
## 3: Control  77
## 4:   Other  11
## 5:     FTD  61
## 6:  ALS-AD  12
print("Tissues Per Disease Category")
## [1] "Tissues Per Disease Category"
clean_data_table[,length(unique(sample)),by = disease]
##    disease  V1
## 1: ALS-FTD 110
## 2:     ALS 764
## 3: Control 199
## 4:   Other  70
## 5:     FTD 138
## 6:  ALS-AD  68
print("Number of patients per rs12973192 genotype")
## [1] "Number of patients per rs12973192 genotype"
clean_data_table[,length(unique(participant_id)),by = call]
##    call  V1
## 1:  C/C 166
## 2:  G/G  58
## 3:  C/G 153
print("Number of tissues per disease")
## [1] "Number of tissues per disease"
clean_data_table[,.N,by = c("disease","tissue_clean")]
##     disease         tissue_clean   N
##  1: ALS-FTD       Frontal_Cortex  22
##  2:     ALS       Frontal_Cortex 132
##  3: Control       Frontal_Cortex  40
##  4:   Other       Frontal_Cortex  11
##  5: ALS-FTD   Lumbar_Spinal_Cord  15
##  6:     ALS   Lumbar_Spinal_Cord 105
##  7: Control   Lumbar_Spinal_Cord  33
##  8:   Other   Lumbar_Spinal_Cord   9
##  9: ALS-FTD Cervical_Spinal_Cord  14
## 10:     ALS Cervical_Spinal_Cord 103
## 11: Control Cervical_Spinal_Cord  32
## 12:   Other Cervical_Spinal_Cord  10
## 13: ALS-FTD         Motor_Cortex  28
## 14:     ALS         Motor_Cortex 175
## 15: Control         Motor_Cortex  23
## 16:   Other         Motor_Cortex  16
## 17: ALS-FTD           Cerebellum  13
## 18:     ALS           Cerebellum 129
## 19: Control           Cerebellum  28
## 20:   Other           Cerebellum   8
## 21:     FTD           Cerebellum  58
## 22:     FTD       Frontal_Cortex  45
## 23:  ALS-AD           Cerebellum  11
## 24:  ALS-AD         Motor_Cortex  13
## 25:  ALS-AD Cervical_Spinal_Cord  10
## 26:  ALS-AD   Lumbar_Spinal_Cord  11
## 27:  ALS-AD       Frontal_Cortex  12
## 28:  ALS-AD     Occipital_Cortex   7
## 29:  ALS-AD Thoracic_Spinal_Cord   4
## 30:     ALS     Occipital_Cortex  37
## 31:     ALS Thoracic_Spinal_Cord  33
## 32: ALS-FTD     Occipital_Cortex   6
## 33: Control      Temporal_Cortex  23
## 34:     ALS      Temporal_Cortex  23
## 35:     FTD      Temporal_Cortex  35
## 36:   Other     Occipital_Cortex   7
## 37:   Other Thoracic_Spinal_Cord   6
## 38: Control Thoracic_Spinal_Cord   5
## 39: Control     Occipital_Cortex   5
## 40: ALS-FTD Thoracic_Spinal_Cord   5
## 41:     ALS          Hippocampus  27
## 42: ALS-FTD          Hippocampus   7
## 43:   Other          Hippocampus   3
## 44: Control          Hippocampus  10
##     disease         tissue_clean   N
print("Number of partcipants by mutation and  disease")
## [1] "Number of partcipants by mutation and  disease"
clean_data_table[,length(unique(participant_id)),by = c("mutations","disease")]
##     mutations disease  V1
##  1:      None ALS-FTD  13
##  2:      None     ALS 145
##  3:   C9orf72 ALS-FTD  10
##  4:      None Control  77
##  5:      None   Other  11
##  6:      SOD1     ALS   8
##  7:      OPTN     ALS   4
##  8:   C9orf72     ALS  32
##  9:     MATR3     ALS   1
## 10:       ANG     ALS   1
## 11:   C9orf72     FTD  12
## 12:      None  ALS-AD  11
## 13:      None     FTD  42
## 14:   C9orf72  ALS-AD   1
## 15:      TBK1     FTD   2
## 16:      MAPT     FTD   5
## 17:       FUS     ALS   2
print(glue::glue("Number of patients per pathology:"))
## Number of patients per pathology:
clean_data_table[,length(unique(participant_id)),by = .(pathology)]
##     pathology  V1
##  1:   ALS-FTD  23
##  2:       ALS 193
##  3:   control  77
##  4:     Other  11
##  5:            13
##  6:    ALS-AD  12
##  7: FTD-TDP-A  24
##  8: FTD-TDP-B   3
##  9: FTD-TDP-C   9
## 10:   FTD-TAU   7
## 11:   FTD-FUS   5

UNC13A cryptic is an event that is specific to TDP-43 proteinopathy

FTLD-non-TDP are those with TAU and FUS aggregates

Non-tdp ALS are those with SOD1 or FUS mutations. The n’s are quite low on this unfortunately, only 8 ALS with SOD1 and 2 with FUS mutations.

First we look at detection rate in tissues affected by TDP-43 proteinopathy, For FTLD this is frontal and temporal Cortices, and for ALS this is lumbar, cervical, and thoracic spinal cord samples as well as motor cortex. For controls we also take all 6 tissues, frontal,temporal,motor cortices and the lumbar, cervical, and thoracic spinal cords.

(As a side note, once we do this the number of ALS-non-TDP drops down to 6 (2 FUS) because the ALS sample tissues are not balanced and not every participant has samples in every tissue)

####Inclusion reads by if TDP-potential####
boxplot_table = clean_data_table %>% 
    mutate(disease_group2 = case_when(disease == "Control" ~ "Control",
                                 pathology %in% c("FTD-TDP-A","FTD-TDP-B","FTD-TDP-C") ~ "FTLD-TDP",
                                 pathology %in% c("FTD-TAU","FTD-FUS") ~ "FTLD \n non-TDP",
                                 mutations %in% c("SOD1","FUS") ~ "ALS \n non-TDP",
                                 T ~ "ALS-TDP")) %>% 
    mutate(across(UNC13A_3prime_leaf:UNC13A_annotated_leaf, ~ .x / library_size,.names = "{.col}_library_norm")) %>% 
    filter(!tissue_clean %in% c("Choroid","Liver")) %>% 
    dplyr::select(sample,participant_id,mutations,disease_group2,pathology,tissue_clean,contains("_library_norm")) %>% 
    melt() %>% 
    filter(grepl("_3prime|_5prime_",variable)) %>% 
    group_by(sample) %>% 
    mutate(inclusion_reads = sum(value)) %>% 
    ungroup() %>% 
    unique() %>% 
    mutate(junction_name = case_when(variable == "UNC13A_3prime_leaf_library_norm" ~ "  Novel 3'", 
                                 variable == "UNC13A_5prime_1_leaf_library_norm" ~ " Short Novel 5'", 
                                 variable == "UNC13A_5prime_2_leaf_library_norm"~ "Long Novel 5'")) %>% 
    mutate(disease_tissue = case_when((grepl("FTLD",disease_group2) & grepl("Cortex",tissue_clean))  ~ T,
                                      (grepl("ALS",disease_group2) & grepl("Cord|Motor",tissue_clean))  ~ T,
                                      (grepl("Occipital",tissue_clean)) ~ F,
                                      (grepl("Control",disease_group2) & grepl("Cord|Cortex",tissue_clean)) ~ T,
           TRUE ~ F)) %>% 
    mutate(tissue_clean = gsub("_"," ",tissue_clean))

Looking at disease tissue only, so just taking the cords in ALS and the frontal and temporal cortex of FTLD and then the cord and cortices in Controls.

boxplot_table %>% 
  filter(disease_tissue == T) %>% 
  mutate(detected = inclusion_reads > 0) %>% 
  dplyr::select(participant_id,disease_group2,detected) %>% 
  unique() %>% 
  group_by(disease_group2) %>% 
  mutate(n_sample = n_distinct(participant_id)) %>% 
  mutate(n_sample_detected = sum(detected)) %>% 
  dplyr::select(disease_group2,n_sample,n_sample_detected) %>% 
  unique() %>% 
  mutate(detection_rate = n_sample_detected / n_sample) %>% 
  mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>% 
  mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>% 
  ggplot() +
  geom_col(aes(x = detection_name, y = detection_rate)) + 
  ggpubr::theme_pubr() + 
  scale_y_continuous(lim = c(0,1),labels = scales::percent) + 
  ylab("Percent of Patients \n UNC13A Cryptic Detected") + 
  theme(text = element_text(size = 18)) + 
  xlab("N individuals")

disease_comparisons = list( c("Control","ALS-TDP"),
                           c("Control","ALS \n non-TDP"),
                           c("Control","FTLD-TDP"),
                           c("Control","FTLD \n non-TDP" ))

table_to_test = boxplot_table %>% 
    filter(disease_tissue == T) %>% 
    group_by(disease_group2) %>% 
    mutate(n_sample = n_distinct(sample)) %>% 
    mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>% 
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>% 
    dplyr::select(detection_name,
                  participant_id,
                  inclusion_reads,
                  disease_group2) %>% 
    unique() 

test_pair = pairwise.wilcox.test(table_to_test$inclusion_reads, table_to_test$detection_name,
                     p.adjust.method = "BH") %>% broom::tidy()

test_pair = test_pair %>% 
  mutate(p_value_draw = case_when(p.value < 0.0001~ "***",
                                  p.value < 0.01 ~ "**",
                                   p.value < 0.05 ~ "*",
                          TRUE ~ paste0("Adj. p-value \n",as.character(round(p.value,2))))) %>% 
  mutate(y.position = seq(0.25,by = 0.1,length.out = 9))

table_to_test %>% 
    ggplot(aes(x = detection_name, y = inclusion_reads * 10^6)) + 
    geom_boxplot() + 
    geom_jitter(height = 0) + 
    scale_y_log10() + 
    ggpubr::theme_pubr() +
    ylab("UNC13A cryptic inclusion \n reads per million") + 
    theme(text = element_text(size = 18)) + 
    xlab("N samples") + 
    stat_pvalue_manual(test_pair %>% filter(p.value < 0.05),
                       label = "p_value_draw") +
   stat_compare_means()

####Inclusion reads by if TDP-potential####
boxplot_table %>% 
  filter(disease_group2 %in% c("Control",
                              "FTLD \n non-TDP",
                              "FTLD-TDP")) %>% 
    filter(tissue_clean %in% c("Cerebellum","Frontal Cortex","Temporal Cortex")) %>% 
  dplyr::select(disease_group2,
                tissue_clean,
                junction_name,
                value) %>% 
  unique() %>% 
    ggplot(aes(x = disease_group2, 
               y = value * 10^6,
               fill = junction_name)) + 
    geom_boxplot(show.legend = F) + 
    geom_jitter(height = 0,show.legend = F) + 
    scale_y_log10() + 
    ggpubr::theme_pubr() +
    facet_grid(vars(junction_name),vars(tissue_clean)) + 
    ylab("UNC13A cryptic \n reads per million") +
    theme(text = element_text(size = 18)) + 
    xlab("") +
    scale_fill_manual(values = colorblind_pal()(4)[2:4]) 

Splitting by the type of junction in ALS

####Inclusion reads by if TDP-potential####
boxplot_table %>% 
  filter(disease_group2 %in% c("Control",
                              "ALS \n non-TDP",
                              "ALS-TDP")) %>% 
      filter(tissue_clean %in% c("Cerebellum","Motor Cortex","Temporal Cortex") | grepl("Cord",tissue_clean)) %>% 
  dplyr::select(disease_group2,
                tissue_clean,
                junction_name,
                value) %>% 
  unique() %>% 
  mutate(disease_group2 = fct_relevel(disease_group2,"Control","ALS \n non-TDP","ALS-TDP")) %>% 
    ggplot(aes(x = disease_group2, 
               y = value * 10^6,
               fill = junction_name)) + 
    geom_boxplot(show.legend = F) + 
    geom_jitter(height = 0,show.legend = F) + 
    scale_y_log10() + 
    ggpubr::theme_pubr() +
    facet_grid(vars(junction_name),vars(tissue_clean)) + 
    ylab("UNC13A cryptic \n reads per million") +
    theme(text = element_text(size = 18)) + 
    xlab("") +
    scale_fill_manual(values = colorblind_pal()(4)[2:4]) +
  theme(axis.text.x =  element_text(size = 14))

clean_data_table %>% 
    mutate(disease_group2 = case_when(disease == "Control" ~ "Control",
                                 pathology %in% c("FTD-TDP-A","FTD-TDP-B","FTD-TDP-C") ~ "FTLD-TDP",
                                 pathology %in% c("FTLD-TAU","FTLD-FUS") ~ "FTLD-non-TDP",
                                 mutations %in% c("SOD1","FUS") ~ "ALS \n non-TDP",
                                 T ~ "ALS-TDP")) %>% 
    filter(disease_group2 %in% c("ALS \n non-TDP","ALS-TDP")) %>% 
    mutate(across(UNC13A_3prime_leaf:UNC13A_annotated_leaf, ~ .x / library_size,.names = "{.col}_library_norm")) %>% 
    filter(!tissue_clean %in% c("Choroid","Liver")) %>% 
    dplyr::select(sample,disease_group2,tissue_clean,contains("_library_norm")) %>% 
    melt() %>% 
    filter(grepl("_3prime|_5prime_1",variable)) %>% 
    group_by(sample) %>% 
    mutate(inclusion_reads = sum(value)) %>% 
    ungroup() %>% 
    dplyr::select(-variable,-value) %>% 
    unique() %>% 
    ggplot(aes(x = disease_group2, y = inclusion_reads * 10^6)) + 
    geom_boxplot() + 
    geom_jitter(height = 0) + 
    facet_wrap(~tissue_clean,nrow = 2) + 
    scale_y_continuous(trans = 'log10') + 
    ggpubr::theme_pubr() +
    ylab("UNC13A cryptic inclusion reads per million")  

Relationship between UNC13A and STMN2 cryptic PSI

We noted in the cell lines that there was a linear relationship between the amount of TARDBP KD efficiency and the amount of observed UNC13A cryptic. As a quantitative measure of TDP-43 proteinopathy does not exist for these patients, no ranked staining, we used the amount of STMN2 present in a tissue to rank the tissue by the amount of TDP-43 proteinopathy, as we’ve already shown you can use that as a measure. Here we show the relationship in samples with at least 10 STMN2 annotated junctions and tissues where the PSI of UNC13A cryptic was also detectable, as measured by STMN2 cryptic occurring in the tissue.

Scatter plot showing the relationship in non-logspace between STMN2 and UNC13A cryptic PSI

####scatter plot showing the correlation in non-log space in STMN2 and cryptic PSI####
clean_data_table %>% 
    filter(STMN2_annotated_leaf > 10) %>% 
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>% 
    filter(unc13a_cryptic_leaf_psi > 0 ) %>%
    group_by(call) %>% 
    ggpubr::ggscatter(., 
                      x = "stmn_2_cryptic_psi_leaf",
                      y = "unc13a_cryptic_leaf_psi",
                      add = "reg.line",
                      cor.coef = T) +
    ylab("UNC13A Cryptic PSI ") + 
    xlab("STMN2 Cryptic PSI ") 
## `geom_smooth()` using formula 'y ~ x'

####scatter plot showing the correlation in non-log space in STMN2 and cryptic PSI####
clean_data_table %>% 
    filter(STMN2_annotated_leaf > 10) %>% 
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>% 
    filter(unc13a_cryptic_leaf_psi > 0 ) %>%
    mutate(call = fct_relevel(call,"C/C", "C/G", "G/G")) %>%
    group_by(call) %>% 
    ggpubr::ggscatter(., 
                      x = "stmn_2_cryptic_psi_leaf",
                      y = "unc13a_cryptic_leaf_psi",
                      add = "reg.line",
                      color = "call") +
    ylab("UNC13A Cryptic PSI ") + 
    xlab("STMN2 Cryptic PSI ") +
    stat_cor(aes(color = call),show.legend = FALSE) +
    scale_color_manual(values = c("#88CCEE","#44AA99","#105e2a"))
## `geom_smooth()` using formula 'y ~ x'

You can see a clear relationship, which appears to be modulated by the genotype of rs12973192. This lead us to see if genotype was actually predictive of the amount of UNC13A cryptic psi. Because PSI’s tend to be log linear, we chose to model UNC13A in the log10 + 1 (log10(1) is zero) and we selected for tissues which show TDP-43 pathology, as we assume this is an event which only begins to show clear after TDP-43 pathology arises.

Scatter plot showing the relationship in log + 1 between STMN2 and UNC13A cryptic PSI

clean_data_table %>% 
  filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>% 
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    filter(STMN2_annotated_leaf > 10) %>% 
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>% 
    filter(unc13a_cryptic_leaf_psi > 0 ) %>% 
    mutate(log10_stmn_2_cryptic_psi_leaf_plusone = log10(stmn_2_cryptic_psi_leaf + 1)) %>% 
    mutate(log10_unc13a_cryptic_leaf_psi_plusone = log10(unc13a_cryptic_leaf_psi + 1)) %>% 
    group_by(call) %>% 
    ggpubr::ggscatter(., 
                      x = "log10_stmn_2_cryptic_psi_leaf_plusone",
                      y = "log10_unc13a_cryptic_leaf_psi_plusone",
                      color = 'call',fill = 'call',                      
                      add = "reg.line") +
    ylab("Log10 UNC13A Cryptic PSI + 1") + 
    xlab("Log10 STMN2 Cryptic PSI + 1 ") +
    scale_color_manual(values = c("#88CCEE","#44AA99","#105e2a")) + 
    scale_fill_manual(values = c("#88CCEE","#44AA99","#105e2a"))
## `geom_smooth()` using formula 'y ~ x'

We can perform this analysis separated by tissue, but low numbers make the results more complicated to interpret. Additionally, the tissue only matters in so much as we know that certain tissues in certain diseases are affected by TDP-43 pathology, whereas others aren’t, tissue is not the most interesting or important variable, TDP-43 pathology is the question.

####scatter plot showing the correlation in non-log space in STMN2 and cryptic PSI####
clean_data_table %>% 
    filter(tissue_clean%in% c("Cervical_Spinal_Cord",
                              "Motor_Cortex",
                              "Frontal_Cortex","Temporal_Cortex")) %>% 
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    filter(STMN2_annotated_leaf > 10) %>% 
    filter(cryptic_psi > 0 ) %>% 
    filter(stmn_2_cryptic_psi > 0 ) %>% 
    group_by(call) %>% 
    ggpubr::ggscatter(., 
                      x = "stmn_2_cryptic_psi",
                      y = "unc13a_cryptic_leaf_psi",
                      color = 'call',                      
                      add = "reg.line") +
    ylab("UNC13A Cryptic PSI ") + 
    xlab("STMN2 Cryptic PSI ") +
    scale_color_manual(values = c("#88CCEE","#44AA99","#105e2a")) +
    facet_wrap(~tissue_clean)
## `geom_smooth()` using formula 'y ~ x'

Modeling UNC13A cryptic using glms and mixed models

This brought us to the idea that we could even potentially model UNC13A cryptic pathology using genotype and the amount of TDP-43 pathology. We’ve coded the number of G alleles here as a number and are looking into tissue with TDP-43 pathology(measure with STMN2 cryptic PSI).

new_regress = clean_data_table %>%
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    filter(STMN2_annotated_leaf > 10) %>% 
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>%  
    mutate(log10_stmn_2_cryptic_psi_leaf = log10(stmn_2_cryptic_psi_leaf)) %>% 
    mutate(log10_unc13a_cryptic_leaf_psi_plusone = log10(unc13a_cryptic_leaf_psi + 1)) %>% 
    mutate(log10_stmn_2_cryptic_psi_leaf_plusone = log10(stmn_2_cryptic_psi_leaf + 1))


another_model2 = lm(log10(unc13a_cryptic_leaf_psi + 1) ~ 
                   log10(stmn_2_cryptic_psi_leaf + 1) * number_g_alleles + pathology + tissue_clean,
              new_regress)


the_pretty_plot = pretty_effect_plot(another_model2,p_value_cutoff = 1,"UNC13A Cryptic PSI") + ggpubr::theme_pubr()
print(the_pretty_plot)

We find that at a cutoff of 0.01 for significant effects, only the interaction between the number of G alleles and the amount of STMN2 pathology is predictive of UNC13A cryptic inclusion. the absolute amount of STMN2 pathology does not vary between genotypes.

genotype_comparisons = list(c("C/C", "C/G"), c("C/C", "G/G"), c("C/G", "G/G"))


ggboxplot(new_regress, x = "call", y = "log10_stmn_2_cryptic_psi_leaf",
          color = "call", palette = "jco")+
  stat_compare_means() +
  stat_compare_means(comparisons = genotype_comparisons)

ggboxplot(new_regress, x = "call", y = "stmn_2_cryptic_psi_leaf",
          color = "call", palette = "jco")+
  stat_compare_means() +
  stat_compare_means(comparisons = genotype_comparisons)

new_regress %>% 
    ggplot() + 
    stat_ecdf(aes(color = call, x = log10_stmn_2_cryptic_psi_leaf))

We see that detection rate by genotype varies.

overall_fisher = clean_data_table %>%
    filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>% 
  mutate(unc13a_detected = unc13a_cryptic_leaf_psi > 0) %>% 
  dplyr::select(participant_id,unc13a_detected,call) %>% 
  unique() %>% 
  group_by(call) %>% 
  mutate(n_sample = n_distinct(participant_id)) %>% 
  mutate(n_sample_detected = sum(unc13a_detected)) %>% 
  dplyr::select(call,n_sample,n_sample_detected) %>% 
  unique() %>% 
  mutate(n_non_detected = n_sample - n_sample_detected) %>% 
  dplyr::select(-n_sample)


over_p = overall_fisher %>% 
  column_to_rownames('call') %>% 
  fisher.test() %>% 
  broom::tidy() %>% 
  .$p.value

overall_fisher %>% 
  mutate(n_sample = n_non_detected + n_sample_detected) %>% 
  mutate(detection_rate = n_sample_detected / n_sample) %>% 
  mutate(detection_name = glue::glue("{call} \n ( {n_sample} )")) %>% 
  ggplot(aes(x = detection_name, y = detection_rate, fill = detection_name)) +
  geom_col(show.legend = F) + 
  ggpubr::theme_pubr() + 
  scale_y_continuous(labels = scales::percent) + 
  ylab("% of TDP-43 Proteionopathy Patients \n UNC13A Cryptic Detected") + 
  theme(text = element_text(size = 18)) + 
  xlab("N individuals") + 
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) 

Although this difference is not significant, with the Fisher’s exact giving a p-value of 0.28.

Only looking at disease relevant tissue and in patients

detection_table = clean_data_table %>% 
  filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>% 
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    mutate(stmn2_psi_groups = as.numeric(cut_interval(log10(stmn_2_cryptic_psi), n = 2))) %>%
    mutate(stmn2_psi_groups = ifelse(is.na(stmn2_psi_groups),"  No STMN2",stmn2_psi_groups)) %>%
    mutate(stmn2_psi_groups = case_when(stmn2_psi_groups == 1 ~ " Low STMN2",
                                        stmn2_psi_groups == 2 ~ "High STMN2",
                                        TRUE ~ stmn2_psi_groups)) %>%
  mutate(unc13a_detected = unc13a_cryptic_leaf_psi > 0) %>% 
  group_by(call,unc13a_detected,stmn2_psi_groups) %>% 
  add_count(name = "genotype_detected") %>% 
  dplyr::select(call,unc13a_detected,genotype_detected,stmn2_psi_groups) %>% 
  unique() %>% 
  pivot_wider(names_from = "unc13a_detected",
              values_from = "genotype_detected") %>% 
  dplyr::rename(unc_not_detected = `FALSE`, unc_cryptic_detected = `TRUE`) %>%
  mutate(total_tissue = (unc_not_detected + unc_cryptic_detected)) %>% 
  mutate(detection_rate = unc_cryptic_detected / total_tissue) %>% 
  ungroup() %>% 
  mutate(stmn2_psi_groups = fct_relevel(stmn2_psi_groups, "No STMN2", after = 0)) %>% 
  mutate(detection_name = glue::glue("{call} \n ( {total_tissue} )"))



detection_table %>% 
  ggplot(aes(x = call, y = detection_rate,fill = call)) + 
  geom_col(show.legend = F,position = 'dodge2') + 
  facet_wrap(~stmn2_psi_groups) +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) + 
  scale_y_continuous(lim = c(0,0.5),labels = scales::percent) + 
  ylab("Percent of Samples \n UNC13A Cryptic Detected") + 
  theme(text = element_text(size = 18))  +
  ggpubr::theme_pubr() + 
  geom_text(aes(label = total_tissue,y = 0),vjust = 1,size = 6) + 
  theme(text = element_text(size = 18)) +
  ggpubr::theme_pubr() +
  xlab("N Tissues") 

UNC13A cryptic PSI

clean_data_table %>%   
  filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>% 
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    filter(stmn2_annotated > 10) %>% 
    filter(cryptic_psi > 0) %>% 
    ggbarplot( x = "call", 
               y = "cryptic_psi", 
               color = 'call',
              position = position_dodge(0.8),
              add = c("mean_se","jitter")) + 
    ylab("UNC13A Cryptic PSI") + 
    scale_color_manual(values = c("#88CCEE","#44AA99","#105e2a")) +
    theme(text = element_text(size = 20)) + 
    stat_compare_means(aes(group = call), label = "p.format") + 
    stat_compare_means(aes(group = call),comparisons = genotype_comparisons,label = "p.signif")

clean_data_table %>%   
  filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>% 
    mutate(call = fct_relevel(call,
                              "C/C", "C/G", "G/G")) %>%
    mutate(stmn2_psi_groups = as.numeric(cut_interval(log10(stmn_2_cryptic_psi), n = 2))) %>%
    mutate(stmn2_psi_groups = ifelse(is.na(stmn2_psi_groups),"  No STMN2",stmn2_psi_groups)) %>%
    mutate(stmn2_psi_groups = case_when(stmn2_psi_groups == 1 ~ " Low STMN2",
                                        stmn2_psi_groups == 2 ~ "High STMN2",
                                        TRUE ~ stmn2_psi_groups)) %>%
    filter(cryptic_psi > 0) %>%
    ggbarplot( x = "call", 
               y = "cryptic_psi", 
               color = 'call',
               facet.by = 'stmn2_psi_groups',
              position = position_dodge(0.8),
              add = c("mean_se","jitter")) + 
    ylab("UNC13A Cryptic PSI") + 
    scale_color_manual(values = c("#88CCEE","#44AA99","#105e2a")) +
    theme(text = element_text(size = 20)) + 
    stat_compare_means(aes(group = call), label = "p.format") + 
    stat_compare_means(aes(group = call),comparisons = genotype_comparisons,label = "p.signif")  + 
  xlab("")